#######################################################
###           Set up R session                      ###
#######################################################

### The packages below must be installed before they can be loaded, e.g.
### install.packages(c("TraMineR", "cluster", "ggseqplot"))
# Call in R packages
library(TraMineR)
library(cluster)
library(ggseqplot)

# Load the 'biofam' data set, which ships with the TraMineR package
data(biofam)

#######################################################
###   Step 1: Creating a sequence object            ###
#######################################################

# Basic sequence object built from columns 10 to 25 of biofam
basic.sequence.object <- seqdef(biofam, var = 10:25)

### Label vectors for a more descriptive sequence object
# Short state names for the family formation states
states.biofam <- c("P", "L", "M", "LM", "C", "LC", "LMC", "D")

# Long labels for the same states, listed in the same order as the short names
labels.biofam <- c(
  "Parents", "Left", "Married", "Left/Married",
  "Child", "Left/Child", "Left/Married/Child", "Divorced"
)

# Sequence object using the short state names and long labels defined above
# (no custom colour palette, i.e. no 'cpal' argument, is passed here)
complex.sequence.object <- seqdef(
  biofam,
  var = 10:25,
  states = states.biofam,
  labels = labels.biofam
)

#######################################################
###   Step 2: Creating a distance matrix            ###
#######################################################

### Pairwise dissimilarities between the sequences
# method = "HAM": Hamming method, which aligns sequences using substitutions only
# sm = "TRATE": substitution costs based on the transition rates between states
distance.matrix <- seqdist(
  complex.sequence.object,
  method = "HAM",
  sm = "TRATE"
)

#######################################################
###  Step 3: Cluster analysis of distance matrix    ###
#######################################################

### Cluster analysis of distance matrix
# Conduct hierarchical clustering using Ward's method
ward.hclust <- hclust(as.dist(distance.matrix), method = "ward.D2")

# Generate visualisation showing interlinking clusters (also known as a
# 'dendrogram' or 'cluster tree').
# Leaf labels are suppressed: with one label per sequence they overlap into an
# unreadable band along the bottom of the plot.
plot(ward.hclust, labels = FALSE)

# 'Cut' the cluster tree so that each person is assigned to one of 5 clusters.
# The result is kept as a separate R object so it can be used as a grouping
# variable for the visualisations in Step 4.
# Change the value of k to extract a different number of clusters
# (e.g., k = 4 for a 4-cluster solution).
cluster.solution <- cutree(ward.hclust, k = 5)

# Attach the cluster variable to the biofam data set as a column, which allows
# cross tabulations and regressions with variables not included in the
# sequence analysis.
# The column is assigned by name rather than appended with cbind(), so that
# re-running this step (for example with a different k) overwrites the existing
# column instead of adding a second, identically named one.
# unname() drops the per-sequence names returned by cutree(), matching what
# cbind() stored.
biofam$cluster.solution <- unname(cluster.solution)

# Frequency table of the number of people in each cluster (1 to 5)
table(cluster.solution)

# Proportion of all people contained in each cluster (1 to 5)
prop.table(table(cluster.solution))

#######################################################
###  Step 4: Interpreting clusters                  ###
#######################################################

# Mean time spent in each state, plotted by cluster
seqmtplot(complex.sequence.object, group = cluster.solution)

# State distribution plots by cluster
seqdplot(complex.sequence.object, group = cluster.solution)

# Medoid (most central) sequence by cluster.
# criterion = "dist" ranks candidate sequences by centrality, so with nrep = 1
# the single representative shown per cluster is its medoid. Without it, the
# default criterion ("density") picks the sequence with the densest
# neighbourhood, which is not necessarily the medoid.
seqrplot(
  complex.sequence.object,
  group = cluster.solution,
  diss = distance.matrix,
  criterion = "dist",
  nrep = 1
)

# Transition rate matrix for Cluster 1
# Change the number after 'cluster.solution ==' to produce the matrix for the
# corresponding cluster
ggseqtrplot(
  complex.sequence.object[cluster.solution == 1, ],
  x_n.dodge = 2,
  dss = TRUE
)